{ Copyright (C) by Tobias Schwarz, 2003-2007
  Distributed under the GNU General Public License }

unit Split_Logfile;

interface

uses Classes, WildMatch, SysUtils, StrUtils, DateUtils,  YclZlib;
{The YclZlib Unit is part of the Y core library, which is copyright by
Peter J. Haas. Please visit http://delphi.pjh2.de/ for the most recent version
of the library, supporting documentation and license terms.}

type
   { Filters a web server access log file into a destination file.
     Depending on the public option fields it keeps only the lines that belong
     to one domain (SplitLog) and/or strips a host name from the request URL
     (RemoveHost). Source files ending in '.gz' are read through a gzip reader;
     the output is gzip-compressed when CompressFilteredLog is set. }
   TPreprocessLog = class(TObject)
   private
      { Source/destination paths and the domain passed to PreprocessLog;
        DomainII holds the "www."/non-"www." counterpart of fDomain when Mix
        is set, otherwise it is empty. }
      fSourcefile, fDestfile, fDomain, DomainII: string;
      { Lines of the chunk currently being processed. }
      LogLines: TStringList;
      { Only created when LogIsCompressed / CompressFilteredLog are set. }
      GZipReader: TGZipReader;
      GZipWriter: TGZipWriter;
      { Raw source and destination file streams, opened by OpenLog. }
      Src, Dst: TFileStream;
      { True when the source file extension is '.gz'. }
      LogIsCompressed: Boolean;
      { Size in bytes of the read buffer, chosen by OpenLog. }
      BufferSize: Integer;
      { Time at which the current PreprocessLog call started. }
      StartTime: TDateTime;
      { Opens the streams and picks the buffer size. }
      procedure OpenLog;
      { Reads the source chunk by chunk, filters and writes it. }
      procedure ProcessLog;
      { Writes the current content of LogLines to the destination. }
      procedure WriteLog;
      { Releases the streams created by OpenLog. }
      procedure CloseLog;
      { Filters one chunk of text and returns its trailing remainder. }
      function ProcessBuffer(Str: String): String;
      { Returns line number Line of LogLines after domain filtering
        ('' when the line belongs to another domain). }
      function SplitLine(Line: Integer): String;
      { Returns line number Line of LogLines with the host name removed from
        the request. }
      function fRemoveHost(Line: Integer): String;
   public
      { Provider selects the log layout understood by SplitLine ('1und1' or
        'hosteurope'); Hostname is the host to strip when RemoveHost is set.
        PreprocessLog normalises Hostname in place. }
      Provider, Hostname: String;
      { SplitLog: keep only lines of the requested domain.
        Mix: also accept the "www."/non-"www." variant of the domain.
        Remove: delete the domain field from the lines that are kept.
        CompressFilteredLog: gzip the output.
        RemoveHost: strip Hostname from request URLs. }
      SplitLog, Mix, Remove, CompressFilteredLog, RemoveHost: Boolean;
      { Counters filled by PreprocessLog: lines read, lines kept, host names
        removed, and the elapsed time in milliseconds. }
      Entries, Found, HostnamesFound, ProcessTime: Integer;
      constructor Create;
      { NOTE(review): this declaration hides TObject.Free instead of
        overriding Destroy, so it only runs when called through a
        TPreprocessLog-typed reference - confirm callers do so. }
      procedure Free;
      { Runs the whole filtering pass from Sourcefile to Destfile. }
      procedure PreprocessLog(Sourcefile, Destfile, Domain: String);
   end;

implementation

const
  { Upper limit for the read buffer size chosen in OpenLog. }
  MaxBufferSize = 1024 * 1024;  // 1 MByte
  { Granularity of the read buffer: for small source files OpenLog rounds the
    buffer size up to a multiple of this value (32 KByte). }
  BufferBlockSize = 32 * 1024;


constructor TPreprocessLog.Create;
{ Creates the preprocessor with its default options and an empty line list. }
begin
  inherited Create;

  { Line list that holds the chunk currently being worked on. }
  LogLines := TStringList.Create;

  { Domain splitting: switched off; when enabled later it defaults to the
    '1und1' layout, accepts both domain variants and removes the domain field. }
  Provider := '1und1';
  SplitLog := False;
  Mix := True;
  Remove := True;

  { Host removal: switched off, no host configured. }
  Hostname := '';
  RemoveHost := False;

  { Output stays uncompressed unless the caller asks otherwise. }
  CompressFilteredLog := False;
end;

procedure TPreprocessLog.Free;
{ Releases the line list and then the object itself.

  This method hides TObject.Free. The previous implementation only freed
  LogLines and never destroyed the instance, so every TPreprocessLog leaked.
  Calling the inherited Free restores the usual contract: after Free returns
  the object no longer exists and must not be touched (in particular, do not
  call Destroy afterwards). Like TObject.Free it is safe on a nil reference. }
begin
If Self = nil then Exit;
FreeAndNil(LogLines);
inherited Free;  { TObject.Free -> Destroy; Self is invalid from here on }
end;

procedure TPreprocessLog.PreprocessLog(Sourcefile, Destfile, Domain: String);
{ Filters the log file Sourcefile into Destfile.

  Sourcefile  path of the log to read; a '.gz' extension (any case) selects
              gzip decompression.
  Destfile    path of the file to create.
  Domain      domain whose lines are kept; only used when SplitLog is set.

  Resets Entries (and Found / HostnamesFound when the respective option is
  active), normalises the public Hostname field in place and stores the
  elapsed time in milliseconds in ProcessTime. Exceptions raised while
  opening or processing the files propagate to the caller; the streams are
  closed in either case. }
begin
{ Now instead of GetTime: a time-of-day value gives a wrong difference when
  the run crosses midnight. }
StartTime:=Now;
fSourcefile:=Sourcefile;
fDestfile:=Destfile;
LogIsCompressed:=SameText(ExtractFileExt(fSourcefile),'.gz');
Entries:=0;

If SplitLog then begin
   Found:=0;
   fDomain:=Domain;
   If Mix then begin
      { Second pattern: the same domain with the "www." prefix toggled. }
      If Copy(fDomain,1,4)='www.' then DomainII:=Copy(fDomain,5,Length(fDomain))
      else DomainII:='www.'+fDomain;
   end else DomainII:='';
end;

If RemoveHost then begin
   HostnamesFound:=0;
   If Copy(Hostname,1,7)='http://' then Delete(Hostname,1,7);
   If Copy(Hostname,1,4)='www.' then Delete(Hostname,1,4);
   { Guard against an empty host name: indexing position 0 is invalid. }
   If (Hostname<>'') and (Hostname[Length(Hostname)]='/') then
      Delete(Hostname,Length(Hostname),1);
end;

OpenLog;
try
   ProcessLog;
finally
   { Always release the streams, even when processing fails. }
   CloseLog;
end;
ProcessTime:=MillisecondsBetween(Now, StartTime);
end;

procedure TPreprocessLog.OpenLog;
{ Opens the source and destination streams, creates the gzip reader/writer
  where required and chooses the read buffer size.

  The buffer is twice the source size rounded up to a multiple of
  BufferBlockSize, limited to MaxBufferSize and never smaller than one block
  (an empty source would otherwise produce a zero-sized buffer).

  If any step fails, everything opened so far is released again before the
  exception is re-raised, so no file handle is left open. }
begin
{ Start from a known state so the cleanup below never frees a stale pointer. }
Src := nil;
Dst := nil;
GZipReader := nil;
GZipWriter := nil;
try
   Src := TFileStream.Create(fSourcefile, fmOpenRead or fmShareDenyWrite);
   Dst := TFileStream.Create(fDestfile, fmCreate);
   If Src.Size > (MaxBufferSize div 2) then BufferSize := MaxBufferSize  // Limit the buffer size
   else BufferSize := ((Integer(Src.Size) * 2 + BufferBlockSize - 1) div BufferBlockSize) * BufferBlockSize;
   If BufferSize < BufferBlockSize then BufferSize := BufferBlockSize;
   If LogIsCompressed then GZipReader := TGZipReader.Create(Src, ZLibStreamDefaultBufferSize);
   If CompressFilteredLog then GZipWriter := TGZipWriter.CreateDef(Dst);
except
   { Release in reverse order of creation: wrappers before their streams. }
   FreeAndNil(GZipWriter);
   FreeAndNil(GZipReader);
   FreeAndNil(Dst);
   FreeAndNil(Src);
   raise;
end;
end;

procedure TPreprocessLog.ProcessLog;
{ Reads the source in chunks of BufferSize bytes, hands each chunk (prefixed
  with the unfinished line left over from the previous chunk) to
  ProcessBuffer and writes the filtered lines. Whatever remains after the
  last chunk is treated as the final line of the log. }
var Remains: String;
    BlockSize: Integer;
    Buffer: PChar;

   { Filters and writes the first Count bytes of Buffer. }
   procedure Consume(Count: Integer);
   var Chunk: String;
   begin
   { Nothing read: do not hand an empty chunk to ProcessBuffer. }
   If Count<=0 then Exit;
   { The buffer is raw data without a terminating #0, so it must be copied
     with an explicit length rather than converted as a C string. }
   SetString(Chunk, Buffer, Count);
   Remains:=ProcessBuffer(Remains+Chunk);
   WriteLog;
   end;

begin
Remains:='';
{ A zero-sized buffer cannot make progress (OpenLog computes one for an
  empty source file), so there is nothing to read. }
If BufferSize<=0 then Exit;
GetMem(Buffer, BufferSize);
try

   If LogIsCompressed then begin
      while not GZipReader.EndOfStream do begin
         BlockSize:=GZipReader.Read(Buffer^, BufferSize);
         Consume(BlockSize);
      end;
   end else begin
      Repeat
         BlockSize:=Src.Read(Buffer^, BufferSize);
         Consume(BlockSize);
      until BlockSize<BufferSize;  { a short read marks the end of the file }
   end;

   { Last line of the log when it is not followed by a line break. }
   If Remains<>'' then begin
      Inc(Entries);
      LogLines.Text:=Remains;
      If SplitLog then LogLines.Strings[0]:=SplitLine(0);
      If LogLines.Strings[0]<>'' then begin
         If RemoveHost then LogLines.Strings[0]:=fRemoveHost(0);
         Inc(Found);
         WriteLog;
      end;
   end;

finally
   FreeMem(Buffer, BufferSize);
end;
end;

function TPreprocessLog.ProcessBuffer(Str: String): String;
{ Filters one chunk of log text.

  Str     chunk to process; it may end in the middle of a line.
  Result  the unfinished tail of Str (everything after its last line break),
          to be prepended to the next chunk. Empty when Str ends exactly on a
          line break or is empty.

  On return LogLines holds the complete lines of Str that passed the filters.
  Entries is increased by the number of complete lines, Found by the number
  of lines kept.

  The split point is determined on the raw text rather than by taking the
  last element of the parsed list: a list cannot tell whether its last line
  was terminated, so a chunk ending exactly on a line break used to have its
  last line glued to the first line of the following chunk. }
var i, Cut: Integer;
begin
Cut:=Length(Str);
{ A trailing CR may be the first half of a CR/LF pair that continues in the
  next chunk; leave it with the remainder so the pair is rejoined there. }
If (Cut>0) and (Str[Cut]=#13) then Dec(Cut);
{ Cut = position of the last line break that terminates a complete line. }
While (Cut>0) and (Str[Cut]<>#10) and (Str[Cut]<>#13) do Dec(Cut);
Result:=Copy(Str,Cut+1,MaxInt);
SetLength(Str,Cut);

LogLines.Text:=Str;
Inc(Entries,LogLines.Count);
i:=0;
While i<LogLines.Count do begin
   If SplitLog then LogLines.Strings[i]:=SplitLine(i);
   { Empty lines (blank, or rejected by SplitLine) are dropped. }
   If LogLines.Strings[i]='' then LogLines.Delete(i) else begin
      If RemoveHost then LogLines.Strings[i]:=fRemoveHost(i);
      Inc(i);
   end;
end;
Inc(Found,LogLines.Count);
end;

procedure TPreprocessLog.WriteLog;
{ Writes the current content of LogLines to the destination, through the
  gzip writer when CompressFilteredLog is set and directly to the file stream
  otherwise. The text is assembled once instead of once for the pointer and
  once for the length; nothing is written when there are no lines. }
var Data: String;
begin
Data:=LogLines.Text;
If Data='' then Exit;
If CompressFilteredLog then
   GZipWriter.WriteBuffer(PChar(Data)^,Length(Data))
else
   Dst.WriteBuffer(PChar(Data)^,Length(Data));
end;

procedure TPreprocessLog.CloseLog;
{ Releases everything OpenLog created. The gzip wrappers are freed before the
  file streams they are attached to. The file streams are closed even if
  freeing a wrapper raises, and all fields are reset to nil so that no
  dangling reference survives the call. }
begin
try
   try
      If CompressFilteredLog then FreeAndNil(GZipWriter);
   finally
      If LogIsCompressed then FreeAndNil(GZipReader);
   end;
finally
   FreeAndNil(Src);
   FreeAndNil(Dst);
end;
end;

function TPreprocessLog.SplitLine(line: integer): string;
{ Returns entry number line of LogLines if its domain field matches fDomain
  or DomainII, and '' otherwise. When Remove is set the domain field is cut
  out of the returned line. For a Provider other than '1und1' and
  'hosteurope' the line is returned unchanged. }
var DomFound: Shortstring;
    FieldStart, FieldEnd, QuotePos: Integer;
begin
Result:=LogLines.Strings[line];

{ Skip forward over the request, status and bytes entries. }
FieldStart:=Pos('" ',Result);                 { end of the "request" entry }
FieldStart:=PosEx(' ',Result,FieldStart+2);   { blank after the status entry }
FieldStart:=PosEx(' ',Result,FieldStart+1);   { blank after the bytes entry }
FieldEnd:=PosEx(' ',Result,FieldStart+1);     { next blank }

If Provider='1und1' then
begin
   { 1und1: FieldEnd is the blank that follows the domain entry. }
   DomFound:=Copy(Result,FieldStart+1,FieldEnd-FieldStart-1);
   If not ((Matching(DomFound,fdomain)) or (Matching(DomFound,DomainII))) then
      Result:=''
   else if Remove then
      Delete(Result,FieldStart+1,FieldEnd-FieldStart);
end
else if Provider='hosteurope' then
begin
   { Hosteurope: FieldEnd is the blank after the referrer entry; the domain
     is the last quoted entry, following the user client entry. }
   QuotePos:=PosEx('" "',Result,FieldEnd+1);  { between "userclient" and "domain" }
   DomFound:=Copy(Result,QuotePos+3,Length(Result)-QuotePos-3);
   If not ((Matching(DomFound,fdomain)) or (Matching(DomFound,DomainII))) then
      Result:=''
   else if Remove then
      Delete(Result,QuotePos+1,Length(Result)-QuotePos);
end;
end;

function TPreprocessLog.fRemoveHost(line: integer): string;
{ Returns entry number line of LogLines with a leading 'http://', a leading
  'www.' and then Hostname removed from the start of the requested URL.
  HostnamesFound is increased when Hostname itself was found and removed. }
var UrlStart: integer;

   { Deletes Prefix from S at UrlStart when the text there matches it;
     reports whether it did. }
   function Strip(var S: string; const Prefix: string): Boolean;
   begin
   Result:=Matching(Copy(S,UrlStart,Length(Prefix)),Prefix);
   If Result then Delete(S,UrlStart,Length(Prefix));
   end;

begin
Result:=LogLines.Strings[line];
UrlStart:=Pos('] "',Result);                 { start of the "request" entry }
UrlStart:=PosEx(' ',Result,UrlStart+3)+1;    { first character after the blank that follows the request method }
Strip(Result,'http://');
Strip(Result,'www.');
If Strip(Result,Hostname) then Inc(HostnamesFound);
end;

end.
